Part I: Preface of Dataset

The following dataset comes from a 2014 survey conducted by Open Sourcing Mental Health (formerly OSMI), which aimed to measure tech professionals’ attitudes towards mental health and the frequency of mental health disorders within the field of technology.
# load in necessary libraries
library(readr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(RColorBrewer)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(viridis)
## Loading required package: viridisLite
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(ggcorrplot)
library(reshape2)
library(relaimpo)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: boot
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
## 
##     aml
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
# Load the raw survey responses from disk and print a per-column summary.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running on another machine.
survey_csv <- "/Users/kayleetringali/STAT442 Final/survey_2014.csv"
mental_health1 <- read.csv(file = survey_csv)
summary(object = mental_health1)
##   Timestamp              Age                Gender            Country         
##  Length:1259        Min.   :-1.726e+03   Length:1259        Length:1259       
##  Class :character   1st Qu.: 2.700e+01   Class :character   Class :character  
##  Mode  :character   Median : 3.100e+01   Mode  :character   Mode  :character  
##                     Mean   : 7.943e+07                                        
##                     3rd Qu.: 3.600e+01                                        
##                     Max.   : 1.000e+11                                        
##     state           self_employed      family_history      treatment        
##  Length:1259        Length:1259        Length:1259        Length:1259       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  work_interfere     no_employees       remote_work        tech_company      
##  Length:1259        Length:1259        Length:1259        Length:1259       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    benefits         care_options       wellness_program    seek_help        
##  Length:1259        Length:1259        Length:1259        Length:1259       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   anonymity            leave           mental_health_consequence
##  Length:1259        Length:1259        Length:1259              
##  Class :character   Class :character   Class :character         
##  Mode  :character   Mode  :character   Mode  :character         
##                                                                 
##                                                                 
##                                                                 
##  phys_health_consequence  coworkers          supervisor       
##  Length:1259             Length:1259        Length:1259       
##  Class :character        Class :character   Class :character  
##  Mode  :character        Mode  :character   Mode  :character  
##                                                               
##                                                               
##                                                               
##  mental_health_interview phys_health_interview mental_vs_physical
##  Length:1259             Length:1259           Length:1259       
##  Class :character        Class :character      Class :character  
##  Mode  :character        Mode  :character      Mode  :character  
##                                                                  
##                                                                  
##                                                                  
##  obs_consequence      comments        
##  Length:1259        Length:1259       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

A: Preprocessing and cleaning the data

# The Age summary shows implausible values (a negative minimum and a maximum
# around 1e11), so keep only respondents aged 18 through 65 inclusive.
age_in_range <- mental_health1$Age >= 18 & mental_health1$Age <= 65
mental_health2 <- filter(mental_health1, age_in_range)
summary(mental_health2)
##   Timestamp              Age           Gender            Country         
##  Length:1250        Min.   :18.00   Length:1250        Length:1250       
##  Class :character   1st Qu.:27.00   Class :character   Class :character  
##  Mode  :character   Median :31.00   Mode  :character   Mode  :character  
##                     Mean   :32.04                                        
##                     3rd Qu.:36.00                                        
##                     Max.   :65.00                                        
##     state           self_employed      family_history      treatment        
##  Length:1250        Length:1250        Length:1250        Length:1250       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  work_interfere     no_employees       remote_work        tech_company      
##  Length:1250        Length:1250        Length:1250        Length:1250       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    benefits         care_options       wellness_program    seek_help        
##  Length:1250        Length:1250        Length:1250        Length:1250       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   anonymity            leave           mental_health_consequence
##  Length:1250        Length:1250        Length:1250              
##  Class :character   Class :character   Class :character         
##  Mode  :character   Mode  :character   Mode  :character         
##                                                                 
##                                                                 
##                                                                 
##  phys_health_consequence  coworkers          supervisor       
##  Length:1250             Length:1250        Length:1250       
##  Class :character        Class :character   Class :character  
##  Mode  :character        Mode  :character   Mode  :character  
##                                                               
##                                                               
##                                                               
##  mental_health_interview phys_health_interview mental_vs_physical
##  Length:1250             Length:1250           Length:1250       
##  Class :character        Class :character      Class :character  
##  Mode  :character        Mode  :character      Mode  :character  
##                                                                  
##                                                                  
##                                                                  
##  obs_consequence      comments        
##  Length:1250        Length:1250       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Count every NA cell in the data frame once, then report the total:
# either a "no missing data" message or the number of missing values.
na_total <- sum(is.na(mental_health2))
missing_report <- if (na_total > 0) {
  paste("There are", na_total, "missing values in this dataset")
} else {
  "There is no missing data in our dataset"
}
print(missing_report)
## [1] "There are 1882 missing values in this dataset"
# Per-column overview: number of NAs, number of distinct values (unique()
# keeps NA, so NA counts as one distinct value), and the column's class.
# vapply() replaces sapply() so every result is guaranteed to be a single
# value of the declared type. class() may return several classes for some
# column types, which would break data.frame(); only the first is reported.
frame <- data.frame(
  Missing_Values = colSums(is.na(mental_health2)),
  Unique_Features = vapply(
    mental_health2,
    function(x) length(unique(x)),
    integer(1)
  ),
  Data_Types = vapply(
    mental_health2,
    function(x) class(x)[1],
    character(1)
  )
)
print(frame)
##                           Missing_Values Unique_Features Data_Types
## Timestamp                              0            1238  character
## Age                                    0              44    numeric
## Gender                                 0              46  character
## Country                                0              46  character
## state                                513              46  character
## self_employed                         18               3  character
## family_history                         0               2  character
## treatment                              0               2  character
## work_interfere                       262               5  character
## no_employees                           0               6  character
## remote_work                            0               2  character
## tech_company                           0               2  character
## benefits                               0               3  character
## care_options                           0               3  character
## wellness_program                       0               3  character
## seek_help                              0               3  character
## anonymity                              0               3  character
## leave                                  0               5  character
## mental_health_consequence              0               3  character
## phys_health_consequence                0               3  character
## coworkers                              0               3  character
## supervisor                             0               3  character
## mental_health_interview                0               3  character
## phys_health_interview                  0               3  character
## mental_vs_physical                     0               3  character
## obs_consequence                        0               2  character
## comments                            1089             158  character
# List the distinct answers given in 'work_interfere'; NA shows up as its
# own entry because unique() does not drop it.
unique_values <- unique(mental_health2[["work_interfere"]])

print(unique_values)
## [1] "Often"     "Rarely"    "Never"     "Sometimes" NA

B: Work interference has lots of missing values

# One palette entry per distinct 'work_interfere' value (NA is counted as a
# distinct value here).
num_colors <- length(unique(mental_health2[["work_interfere"]]))

# Qualitative "Set2" palette from RColorBrewer.
color_palette <- brewer.pal(n = num_colors, name = "Set2")

# Bar chart of the raw answers: count printed above each bar, legend hidden,
# title centred and bold.
centered_bold_title <- theme(
  plot.title = element_text(hjust = 0.5, face = "bold")
)
plot1 <- ggplot(mental_health2, aes(x = work_interfere, fill = work_interfere)) +
  geom_bar() +
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.5, size = 3.5) +
  scale_fill_manual(values = color_palette, guide = "none") +
  labs(title = "Work Interference Distribution", x = "Work Interference") +
  centered_bold_title

# Rather than dropping rows with NA in 'work_interfere' or 'self_employed',
# replace each NA with that column's most frequent answer.
fill_na_with_mode <- function(values) {
  # table() ignores NA, so the mode is taken over the observed answers only.
  most_frequent <- names(sort(table(values), decreasing = TRUE)[1])
  replace(values, is.na(values), most_frequent)
}

mental_health3 <- mental_health2 %>%
  mutate(
    work_interfere = fill_na_with_mode(work_interfere),
    self_employed = fill_na_with_mode(self_employed)
  )

# Bar chart of 'work_interfere' after imputation: count printed above each
# bar, legend hidden, title centred and bold.
plot2 <- ggplot(mental_health3, aes(x = work_interfere, fill = work_interfere)) +
  geom_bar() +
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.5, size = 3.5) +
  scale_fill_manual(values = color_palette, guide = "none") +
  labs(title = "Updated Work Interference Distribution", x = "Work Interference") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

# Render the chart of the raw answers, then the chart of the imputed answers.
plot1

plot2

C: Condensing ‘Gender’ into 3 categories for simplicity and viewing the distribution

# Collapse the free-text Gender answers into "Male", "Female", or "Other".
# Matching is case-insensitive and whole-word (\b), so "female" never matches
# the "male" alternative. Male is tested first, so an answer containing both
# a male and a female token is labelled "Male".
# Fix: "mal" used to sit in the female list even though it looks like a
# misspelling of "male" (like "malr"/"msle"), so such answers were labelled
# "Female"; it now belongs to the male list.
# NOTE(review): answers written as e.g. "man" or "woman" are in neither list
# and end up as "Other" - confirm that is intended.
male_pattern <- "\\b(?:male|m|mal|maile|malr|msle|make)\\b"
female_pattern <- "\\b(?:female|f|femake)\\b"

gender_raw <- mental_health2$Gender
is_male <- grepl(male_pattern, gender_raw, ignore.case = TRUE, perl = TRUE)
is_female <- grepl(female_pattern, gender_raw, ignore.case = TRUE, perl = TRUE)

# Everything that is neither Male nor Female (including "Guy (-ish) ^_^",
# which needs no special case) becomes "Other".
mental_health2$Gender <- ifelse(
  is_male, "Male",
  ifelse(is_female, "Female", "Other")
)

# Tabulate how many respondents fall into each gender category.
gender_counts <- table(mental_health2$Gender)

# Data-frame version of the table with descriptive column names for plotting.
gender_counts_df <- setNames(
  as.data.frame(gender_counts),
  c("Gender", "Count")
)

# Donut chart (pie with a 0.4 hole) of the gender categories using plotly.
# The horizontal title position is a property of the title itself, so it is
# passed as title = list(text = ..., x = 0.5). Passing `x` directly to
# layout() is not a valid layout attribute and only produced the warning
# "'layout' objects don't have these attributes: 'x'".
gender_fig <- plot_ly(
  gender_counts_df,
  labels = ~Gender,
  values = ~Count,
  type = "pie",
  hole = 0.4
) %>%
  layout(
    title = list(text = "Distribution of Gender Categories", x = 0.5),
    font = list(size = 13, color = "black", family = "Arial", weight = "bold")
  )
# show the plot
gender_fig
# Count respondents per distinct age value.
age_counts <- table(mental_health2$Age)

# Data-frame version of the table with descriptive column names.
age_counts_df <- as.data.frame(age_counts)
colnames(age_counts_df) <- c("Age", "Count")

# Bar per age, height = count, coloured on a yellow-to-red gradient by count.
ggplot(age_counts_df, aes(x = Age, y = Count, fill = Count)) +
  geom_col() +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(title = "Age Distribution", x = "Age", y = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )

# Overlaid, semi-transparent kernel density curves of Age, one per
# treatment answer ("Yes" in seagreen, "No" in purple).
treatment_fill <- c("Yes" = "seagreen", "No" = "purple")

ggplot(mental_health2, aes(x = Age, fill = treatment)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(name = "Treatment Status", values = treatment_fill) +
  labs(title = "Kernel Density Plot of Age by Treatment", x = "Age", y = "Density") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

# Return the `n` countries with the most respondents whose 'treatment' answer
# equals `status`, ordered from most to fewest respondents.
#
# data:   data frame with 'treatment' and 'Country' columns
# status: treatment answer to keep ("Yes" or "No")
# n:      number of countries to return (default 10)
#
# This replaces two copies of the same pipeline. count(sort = TRUE) followed
# by head() yields the same rows as the earlier group_by/summarise/top_n/
# arrange/head chain, without the superseded top_n() and the duplicate
# truncation step.
top_countries_by_treatment <- function(data, status, n = 10) {
  data %>%
    filter(treatment == status) %>%
    dplyr::count(Country, name = "count", sort = TRUE) %>%
    head(n)
}

# top 10 countries among respondents answering 'Yes' to treatment
treated_countries <- top_countries_by_treatment(mental_health2, "Yes")

# top 10 countries among respondents answering 'No' to treatment
not_treated_countries <- top_countries_by_treatment(mental_health2, "No")

# Stack the two top-10 tables, tagging each row with the group it came from.
treating_rows <- transform(treated_countries, treatment_status = "Treating")
not_treating_rows <- transform(not_treated_countries, treatment_status = "Not Treating")
all_countries <- rbind(treating_rows, not_treating_rows)

# Sort the combined rows by ascending count.
all_countries <- all_countries[order(all_countries[["count"]]), ]

# Horizontal dodged bars: one bar per country and treatment group, with
# countries ordered by count via reorder().
status_colors <- c("Treating" = "seagreen", "Not Treating" = "purple")

ggplot(all_countries, aes(x = reorder(Country, count), y = count, fill = treatment_status)) +
  geom_col(position = "dodge", width = 0.6) +
  scale_fill_manual(name = "Treatment Status", values = status_colors) +
  labs(
    title = "Top 10 Countries - Treating vs. Not Treating Mental Health Issues", 
    x = "Country",
    y = "Frequency"
  ) +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.title = element_text(face = NULL),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )

# Dodged bar chart: how often each 'mental_health_consequence' answer occurs,
# split by treatment answer.

# The fill colours are unnamed, so scale_fill_manual() assigns them to the
# levels of factor(treatment) in order.
custom_colors <- c("seagreen", "skyblue")

ggplot(mental_health2, aes(x = mental_health_consequence, fill = factor(treatment))) +
  geom_bar(position = "dodge") +
  labs(
    title = "Frequency of Mental Health Consequence by Treatment",
    x = "Mental Health Consequence",
    y = "Frequency"
  ) +
  scale_fill_manual(name = "Treatment", values = custom_colors) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.title = element_text(face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )

# Numeric encodings of the categorical survey answers, all derived from and
# stored on mental_health3. Two assignments previously read their input from
# mental_health2 (one of them mixing both data frames in a single
# expression); that only worked because the two frames happen to share row
# order, so every column now consistently reads from mental_health3.

# "No" -> 0, any other answer -> 1 (NA stays NA).
encode_no_vs_rest <- function(x) {
  ifelse(x == "No", 0, 1)
}

# "No" -> 0, "Yes" -> 1, any other answer -> 2 (NA stays NA).
encode_no_yes_other <- function(x) {
  ifelse(x == "No", 0, ifelse(x == "Yes", 1, 2))
}

# two-level encodings
mental_health3$family_history_num <- encode_no_vs_rest(mental_health3$family_history)
mental_health3$treatment_num <- encode_no_vs_rest(mental_health3$treatment)
mental_health3$self_employed_num <- encode_no_vs_rest(mental_health3$self_employed)
mental_health3$remote_work_num <- encode_no_vs_rest(mental_health3$remote_work)

# three-level encodings
mental_health3$benefits_num <- encode_no_yes_other(mental_health3$benefits)
mental_health3$wellness_programs_num <- encode_no_yes_other(mental_health3$wellness_program)
mental_health3$seek_help_num <- encode_no_yes_other(mental_health3$seek_help)
mental_health3$anonymity_num <- encode_no_yes_other(mental_health3$anonymity)
mental_health3$mental_health_consequence_num <- encode_no_yes_other(mental_health3$mental_health_consequence)
mental_health3$phys_health_consequence_num <- encode_no_yes_other(mental_health3$phys_health_consequence)

# Columns holding the numeric encodings that enter the correlation analysis.
corr_columns <- c(
  "family_history_num", "treatment_num", "self_employed_num",
  "remote_work_num", "benefits_num", "wellness_programs_num",
  "seek_help_num", "anonymity_num", "mental_health_consequence_num",
  "phys_health_consequence_num"
)
numerical_data <- mental_health3[, corr_columns]

# Pairwise correlations between the selected columns (cor() defaults).
correlation_matrix <- cor(numerical_data)

# Lower-triangle circle heatmap with the coefficients printed on it and the
# variables reordered by hierarchical clustering (hc.order = TRUE).
ggcorrplot(
  correlation_matrix,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3,
  outline.color = "white",
  colors = c("blue", "white", "red"),
  title = "Correlation Heatmap of Numerical Variables"
)

# Recompute the correlation matrix and reshape it to long form (columns
# Var1, Var2, value) so it can be drawn with ggplot2.
correlation_matrix <- cor(numerical_data)
correlation_melted <- melt(correlation_matrix)

# Full tile heatmap: diverging blue-white-red fill centred on 0 and fixed to
# [-1, 1], with each coefficient rounded to two decimals and printed on its
# tile. The x-axis labels are rotated 45 degrees.
ggplot(correlation_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2)), size = 3) +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0, limits = c(-1, 1), na.value = "grey50"
  ) +
  labs(title = "Correlation Heatmap of Numerical Variables") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# quantitative analysis

# Convert both variables to factors with an explicit level order.
# as.factor() orders levels alphabetically ("Maybe", "No", "Yes"), so the
# previous as.numeric() - 1 coded Maybe = 0 and No = 1 rather than the
# intended No = 0, Maybe = 1, Yes = 2. Fixing the order makes the numeric
# codes match that intent.
# NOTE(review): any answer outside the listed levels becomes NA - confirm the
# columns contain only these values.
mental_health2$treatment <- factor(
  mental_health2$treatment,
  levels = c("No", "Yes")
)
mental_health2$mental_health_consequence <- factor(
  mental_health2$mental_health_consequence,
  levels = c("No", "Maybe", "Yes")
)

# Zero-based numeric codes: treatment No = 0, Yes = 1;
# mental_health_consequence No = 0, Maybe = 1, Yes = 2.
mental_health2$treatment_numeric <- as.numeric(mental_health2$treatment) - 1
mental_health2$mental_health_consequence_numeric <- as.numeric(mental_health2$mental_health_consequence) - 1

# Fit a linear regression of the treatment code on the consequence code.
# NOTE: regression output rendered before this change reflects the old
# alphabetical coding; re-run to refresh it.
model <- lm(treatment_numeric ~ mental_health_consequence_numeric, data = mental_health2)
summary(model)
## 
## Call:
## lm(formula = treatment_numeric ~ mental_health_consequence_numeric, 
##     data = mental_health2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5258 -0.5075  0.4743  0.4925  0.5107 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        0.48930    0.02113  23.161   <2e-16 ***
## mental_health_consequence_numeric  0.01823    0.01845   0.988    0.323    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5002 on 1248 degrees of freedom
## Multiple R-squared:  0.0007815,  Adjusted R-squared:  -1.913e-05 
## F-statistic: 0.9761 on 1 and 1248 DF,  p-value: 0.3234